library('stringr')
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('ggplot2')
df = read.csv('resale-flat-prices-based-on-approval-date-final dataset.csv')
df

Q1.1. Which year between 1990 and 2016 showed a rise in the selling of flats? or the maximum number of resale apartments available in a given year?

# getting year from date
df$month=as.Date(paste(df$month,"-01",sep=""))
df$year = format(as.Date(df$month, format="%Y-%m-%d"),"%Y")
df
group_by_year = df %>% group_by(year) %>% tally()
group_by_year
ggplot(group_by_year, aes(x=year, y=n,fill=year)) + 
  geom_bar(stat = "identity")+
  scale_fill_hue(c = 40) +
  scale_x_discrete(guide = guide_axis(angle = 65))+
  theme(legend.position="none")

max_resales=max(group_by_year$n)
print(paste("Max no of resales in a an year: ",max_resales))
## [1] "Max no of resales in a an year:  57782"

Q2 From 1990 to2016, which town has the most expensive selling price?

print("All Town in dataset")
## [1] "All Town in dataset"
distinct(df,town)
sum_of_resale_in_town = df %>% group_by(town) %>% summarise(total=mean(resale_price))

sum_of_resale_in_town=sum_of_resale_in_town[!is.na(sum_of_resale_in_town$total), ]

sum_of_resale_in_town
ggplot(sum_of_resale_in_town, aes(x=town, y=total, group = 1)) +
  geom_line(color = "#0099f9", size = 2) +
  geom_point(color = "#0099f9", size = 5)+
  scale_x_discrete(guide = guide_axis(angle = 65))+
  ggtitle("Most expensive Resale in a town")

max_expensive=max(sum_of_resale_in_town$total)
print(paste("Most expensive town is PUNGGOL with avg. cost: ",max_expensive))
## [1] "Most expensive town is PUNGGOL with avg. cost:  427489.646479146"

Q3. What sorts of flat models are present in the dataset? and what relation do they have with resale price?

df$flat_model = str_to_lower(df$flat_model)
distinct(df,flat_model)
group_by_flat_model = df %>% group_by(flat_model) %>% summarise(resale=mean(resale_price))

group_by_flat_model = group_by_flat_model[!is.na(group_by_flat_model$resale),]

group_by_flat_model
ggplot(group_by_flat_model, aes(x=flat_model, y=resale,fill=flat_model)) + 
  geom_bar(stat = "identity")+
  scale_fill_hue(c = 40) +
  scale_x_discrete(guide = guide_axis(angle = 65))+
  theme(legend.position="none")

Q4 What is the average, maximum, and minimum selling price of a four-bedroom flat in the dataset over the course of 1990-2016, together with comprehensive information of the town, floor area sqm, storey range, and block number?

four_bed_room = subset(df, flat_type == "4 ROOM")
four_bed_room
max_price = max(four_bed_room$resale_price)
min_price = min(four_bed_room$resale_price)
avg_price = mean(four_bed_room$resale_price)

print(paste("Max price of a four bedroom flat is: ",max_price))
## [1] "Max price of a four bedroom flat is:  990000"
print(paste("Min price of a four bedroom flat is: ",min_price))
## [1] "Min price of a four bedroom flat is:  32500"
print(paste("Avg price of a four bedroom flat is: ",avg_price))
## [1] "Avg price of a four bedroom flat is:  280633.517645056"
boxplot(four_bed_room$resale_price)

ggplot(four_bed_room, aes(x=flat_model, y=resale_price, fill=flat_model)) +
    geom_boxplot() +
  scale_fill_hue(c = 40) +
  scale_x_discrete(guide = guide_axis(angle = 65))

ggplot(four_bed_room, aes(x=year, y=resale_price,fill=floor_area_sqm)) + 
    geom_point() +
  scale_x_discrete(guide = guide_axis(angle = 65))

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
temp = select(four_bed_room,resale_price,lease_commence_date,floor_area_sqm)
myplots <- list()  # new empty list
for(i in 1:ncol(temp)){
  col <- names(temp)[i]
  ggp <- ggplot(temp, aes_string(x = col)) +
    geom_histogram(bins = 30, fill = "cornflowerblue", color = "black") +
    geom_vline(xintercept = mean(temp[[col]]), col = "red", lwd=1.5) 
  myplots[[i]] <- ggp  # add each plot into plot list
}

multiplot(plotlist = myplots, cols = 2)

Q6.Categorize which towns are more expensive and which towns are more affordable for property purchasers in the dataset

mean_of_sales_price = mean(df$resale_price,na.rm=TRUE)
sd_of_sales_price = sd(df$resale_price,na.rm = TRUE)
mean_of_sales_price
## [1] 277848.6
sum_of_resale_in_town$Category[sum_of_resale_in_town$total<=mean_of_sales_price]="Affordable"
## Warning: Unknown or uninitialised column: `Category`.
sum_of_resale_in_town$Category[sum_of_resale_in_town$total>mean_of_sales_price ]="Expensive"

sum_of_resale_in_town
ggplot(sum_of_resale_in_town, aes(x=town, y=total)) + 
    geom_point(aes(color = factor(Category), shape=factor(Category))) +
  scale_x_discrete(guide = guide_axis(angle = 65))